# === Step 1: Import the Necessary Libraries ===
!pip install wordcloud
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
from nltk.probability import FreqDist
from wordcloud import WordCloud

# === Download NLTK Resources ===
# Fetch, in order, every NLTK data package this script asks for.
for resource_name in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(resource_name)

# === Step 2: Data Collection ===
# Sample corpus used by every later step. It is assembled from one literal per
# line, each ending in an explicit newline except the last.
text_data = (
    "Text mining is the process of extracting meaningful information from unstructured text data.\n"
    "It involves various techniques such as tokenization, stop word removal, and lemmatization.\n"
    "Text mining, also known as text data mining, is the process of transforming unstructured text\n"
    "into a structured format to identify meaningful patterns and new insights.\n"
    "It can use text mining to analyze vast collections of textual materials to capture key concepts,\n"
    "trends, and hidden relationships."
)

# Show the heading followed by the untouched corpus.
print("=== Raw Text Data ===", text_data, sep="\n")

# === Step 3: Text Preprocessing Function ===
def preprocess_text(text):
    """Convert raw text into a list of cleaned, lemmatized tokens.

    The input is lower-cased and split with ``word_tokenize``. Tokens that
    are English stop words, or that are not purely alphabetic, are dropped.
    Each remaining token is passed to ``WordNetLemmatizer.lemmatize`` with
    no explicit part-of-speech argument.

    Args:
        text: The raw input string.

    Returns:
        A list of lemmatized tokens, in the order they appear in ``text``.
    """
    words = word_tokenize(text.lower())
    english_stops = set(stopwords.words('english'))
    lemmatize = WordNetLemmatizer().lemmatize

    result = []
    for word in words:
        # Skip stop words and anything containing non-letter characters.
        if word in english_stops or not word.isalpha():
            continue
        result.append(lemmatize(word))
    return result
# Run the preprocessing pipeline on the sample corpus and display the result.
cleaned_tokens = preprocess_text(text_data)
print("\n=== Cleaned Tokens ===", cleaned_tokens, sep="\n")

# === Step 4: Frequency Distribution ===
# Count occurrences of each cleaned token and report the ten most common.
fdist = FreqDist(cleaned_tokens)
print(
    "\n=== Frequency Distribution (Top 10 Words) ===",
    fdist.most_common(10),
    sep="\n",
)

# === Step 5: Visualization ===
# Line plot of the ten most frequent tokens.
plt.figure(figsize=(6, 3))
# The title is handed to FreqDist.plot and its own display is suppressed.
# Depending on the installed NLTK release, plot() may call plt.show() itself;
# a title set afterwards would then be attached to a new, empty figure rather
# than to the chart. Showing once, after plotting, keeps title and data together.
fdist.plot(
    10,
    title='Frequency Distribution of Tokens',
    cumulative=False,
    show=False,
    color='blue',
)
plt.show()


# === Step 6: Word Cloud ===
# Seed shared by the word cloud and the NumPy call below.
WORDCLOUD_SEED = 12357

# Build the cloud from the raw corpus. The seed is passed through
# `random_state` so word placement and colours are reproducible across runs;
# seeding NumPy after generation (as was done before) could not influence an
# image that already existed.
wordcloud = WordCloud(
    width=1000,
    height=500,
    background_color='white',
    colormap='Greens',
    random_state=WORDCLOUD_SEED,
).generate(text_data)

# Kept in its original position so any later code that relies on NumPy's
# global RNG being seeded here behaves as before.
np.random.seed(WORDCLOUD_SEED)

plt.figure(figsize=(8, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # The image has no meaningful axes.
plt.title('Word Cloud Representation', fontsize=14)
plt.show()





